Load packages and data
library("phyloseq"); packageVersion("phyloseq")
## [1] '1.30.0'
data(GlobalPatterns)
library("ggplot2"); packageVersion("ggplot2")
## [1] '3.3.2'
library("plyr"); packageVersion("plyr")
## [1] '1.8.6'
Change theme for all future plots to theme_bw
theme_set(theme_bw())
There are tools in the phyloseq package for preprocessing data. You can see examples and details of preprocessing in this dedicated preprocessing tutorial.
When preprocessing you must think hard about the preprocessing that you do and be able to explain why you did them. The steps you took during preprocessing should be clearly documented.
We will preprocess by first filtering low-occurrence, poorly-represented OTUs from the data. Below, the OTUs are removed and indicated not to appear more than 5 times in more than half the samples.
GP = GlobalPatterns
wh0 = genefilter_sample(GP, filterfun_sample(function(x) x > 5), A=0.5*nsamples(GP))
GP1 = prune_taxa(wh0, GP)
Next, we will transform the data to even the sampling depth.
GP1 = transform_sample_counts(GP1, function(x) 1E6 * x/sum(x))
Next, we keep only the top five most abundant phyla.
phylum.sum = tapply(taxa_sums(GP1), tax_table(GP1)[, "Phylum"], sum, na.rm=TRUE)
top5phyla = names(sort(phylum.sum, TRUE))[1:5]
GP1 = prune_taxa((tax_table(GP1)[, "Phylum"] %in% top5phyla), GP1)
The preprocessing still leaves us with 204 OTUs. A major problem here is that some are human-associated microbiomes and some are not. Next, we will define human-associated versus non-human.
human = get_variable(GP1, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
sample_data(GP1)$human <- factor(human)
The plot_ordination() function supports four basic representations of an ordination.
We’ll start by plotting just the OTUs and coloring the points by Phylum.
GP.ord <- ordinate(GP1, "NMDS", "bray")
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1333468
## Run 1 stress 0.1556499
## Run 2 stress 0.1333468
## ... New best solution
## ... Procrustes: rmse 9.624058e-07 max resid 2.600389e-06
## ... Similar to previous best
## Run 3 stress 0.1391747
## Run 4 stress 0.1558954
## Run 5 stress 0.1333468
## ... Procrustes: rmse 2.017668e-06 max resid 5.68365e-06
## ... Similar to previous best
## Run 6 stress 0.1657681
## Run 7 stress 0.1448832
## Run 8 stress 0.144908
## Run 9 stress 0.1333468
## ... New best solution
## ... Procrustes: rmse 3.443168e-06 max resid 9.890168e-06
## ... Similar to previous best
## Run 10 stress 0.1519457
## Run 11 stress 0.1333468
## ... Procrustes: rmse 1.390775e-06 max resid 3.62952e-06
## ... Similar to previous best
## Run 12 stress 0.1487053
## Run 13 stress 0.1529609
## Run 14 stress 0.1612617
## Run 15 stress 0.1333468
## ... Procrustes: rmse 1.645777e-05 max resid 4.777624e-05
## ... Similar to previous best
## Run 16 stress 0.146027
## Run 17 stress 0.1333468
## ... Procrustes: rmse 7.584126e-06 max resid 2.2631e-05
## ... Similar to previous best
## Run 18 stress 0.1333468
## ... Procrustes: rmse 4.249897e-06 max resid 1.245356e-05
## ... Similar to previous best
## Run 19 stress 0.1333468
## ... Procrustes: rmse 1.8177e-06 max resid 4.433006e-06
## ... Similar to previous best
## Run 20 stress 0.1687343
## *** Solution reached
p1 = plot_ordination(GP1, GP.ord, type="taxa", color="Phylum", title="taxa")
print(p1)
This is a complicated looking plot, which is not necessarily a good thing. We can use facet_wrap() function to simplify the plot.
p1 + facet_wrap(~Phylum, 3)
Next, we’ll plot only the samples and color the points by sample type while also modifying the shape according to whether they are human-associated or not.
p2 = plot_ordination(GP1, GP.ord, type="samples", color="SampleType", shape="human")
p2 + geom_polygon(aes(fill=SampleType)) + geom_point(size=5) + ggtitle("samples")
The plot_ordination() function also allows you to be able to plot both samples and OTUs on the same graph.
p3 = plot_ordination(GP1, GP.ord, type="biplot", color="SampleType", shape="Phylum", title="biplot")
# Some stuff to modify the automatic shape scale
GP1.shape.names = get_taxa_unique(GP1, "Phylum")
GP1.shape <- 15:(15 + length(GP1.shape.names) - 1)
names(GP1.shape) <- GP1.shape.names
GP1.shape["samples"] <- 16
p3 + scale_shape_manual(values=GP1.shape)
And, of course, we can simplify this graph (reduce occlusion) by using the argument type=“split” instead of the facet_wrap approach.
p4 = plot_ordination(GP1, GP.ord, type="split", color="Phylum", shape="human", label="SampleType", title="split")
p4
Here we try different method parameter options to the plot_ordination() function.
dist = "bray"
ord_meths = c("DCA", "CCA", "RDA", "DPCoA", "NMDS", "MDS", "PCoA")
plist = llply(as.list(ord_meths), function(i, physeq, dist){
ordi = ordinate(physeq, method=i, distance=dist)
plot_ordination(physeq, ordi, "samples", color="SampleType")
}, GP1, dist)
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1333468
## Run 1 stress 0.1510167
## Run 2 stress 0.146027
## Run 3 stress 0.1719635
## Run 4 stress 0.1465222
## Run 5 stress 0.1333468
## ... New best solution
## ... Procrustes: rmse 1.369236e-06 max resid 3.112657e-06
## ... Similar to previous best
## Run 6 stress 0.1448829
## Run 7 stress 0.1824453
## Run 8 stress 0.1689756
## Run 9 stress 0.1499795
## Run 10 stress 0.1477687
## Run 11 stress 0.1657187
## Run 12 stress 0.146027
## Run 13 stress 0.2704251
## Run 14 stress 0.1627645
## Run 15 stress 0.1471619
## Run 16 stress 0.1658276
## Run 17 stress 0.1475593
## Run 18 stress 0.1499234
## Run 19 stress 0.1660648
## Run 20 stress 0.1333468
## ... Procrustes: rmse 2.003992e-06 max resid 5.035845e-06
## ... Similar to previous best
## *** Solution reached
Store the ordination methods plot results in a list.
names(plist) <- ord_meths
Next, we extract the data from each of the individual plots and put the extracted data into one big dataframe so that we can include all plots in one graphic.
pdataframe = ldply(plist, function(x){
df = x$data[, 1:2]
colnames(df) = c("Axis_1", "Axis_2")
return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"
Next, we can use our dataframe to make a standard faceted ggplot scatterplot.
p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=SampleType, shape=human, fill=SampleType))
p = p + geom_point(size=4) + geom_polygon()
p = p + facet_wrap(~method, scales="free")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + scale_colour_brewer(type="qual", palette="Set1")
p
To replot a larger version of an individual plot, we can print the plist from which the pdataframe was made. Here, we plot the second element in the list.
plist[[2]]
We can add some additional layers to improve the aesthetics of the plot.
p = plist[[2]] + scale_colour_brewer(type="qual", palette="Set1")
p = p + scale_fill_brewer(type="qual", palette="Set1")
p = p + geom_point(size=5) + geom_polygon(aes(fill=SampleType))
p
Next, we can use the ordinate() function to simultaneously perform weighted UniFrac and perform a Principal Component Analysis on the distance matrix. Then, we can pass the data and the ordination results to plot_ordination to create a default ggplot2 graphic.
ordu = ordinate(GP1, "PCoA", "unifrac", weighted=TRUE)
plot_ordination(GP1, ordu, color="SampleType", shape="human")
Then, we add layers to improve the aesthetics of the plot.
p = plot_ordination(GP1, ordu, color="SampleType", shape="human")
p = p + geom_point(size=7, alpha=0.75)
p = p + scale_colour_brewer(type="qual", palette="Set1")
p + ggtitle("MDS/PCoA on weighted-UniFrac distance, GlobalPatterns")
library("phyloseq"); packageVersion("phyloseq")
## [1] '1.30.0'
data("GlobalPatterns")
library("ggplot2"); packageVersion("ggplot2")
## [1] '3.3.2'
theme_set(theme_bw())
pal = "Set1"
scale_colour_discrete <- function(palname=pal, ...){
scale_colour_brewer(palette=palname, ...)
}
scale_fill_discrete <- function(palname=pal, ...){
scale_fill_brewer(palette=palname, ...)
}
GP <- prune_species(speciesSums(GlobalPatterns) > 0, GlobalPatterns)
## Warning: 'prune_species' is deprecated.
## Use 'prune_taxa' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
## Warning: 'speciesSums' is deprecated.
## Use 'taxa_sums' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
plot_richness(GP)
plot_richness(GP, measures=c("Chao1", "Shannon"))
plot_richness(GP, x="SampleType", measures=c("Chao1", "Shannon"))
sampleData(GP)$human <- getVariable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
## Warning: 'getVariable' is deprecated.
## Use 'get_variable' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
## Warning: 'sampleData' is deprecated.
## Use 'sample_data' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
## Warning: 'sampleData<-' is deprecated.
## Use 'sample_data<-' instead.
## See help("Deprecated") and help("phyloseq-deprecated").
plot_richness(GP, x="human", color="SampleType", measures=c("Chao1", "Shannon"))
GPst = merge_samples(GP, "SampleType")
# repair variables that were damaged during merge (coerced to numeric)
sample_data(GPst)$SampleType <- factor(sample_names(GPst))
sample_data(GPst)$human <- as.logical(sample_data(GPst)$human)
p = plot_richness(GPst, x="human", color="SampleType", measures=c("Chao1", "Shannon"))
p + geom_point(size=5, alpha=0.7)
p$layers
## [[1]]
## geom_point: na.rm = TRUE
## stat_identity: na.rm = TRUE
## position_identity
##
## [[2]]
## mapping: ymax = ~value + se, ymin = ~value - se
## geom_errorbar: na.rm = FALSE, orientation = NA, width = 0.1, width = 0.1, flipped_aes = FALSE
## stat_identity: na.rm = FALSE
## position_identity
p$layers <- p$layers[-1]
p + geom_point(size=5, alpha=0.7)
library("phyloseq"); packageVersion("phyloseq")
## [1] '1.30.0'
library("ggplot2"); packageVersion("ggplot2")
## [1] '3.3.2'
theme_set(theme_bw())
data("GlobalPatterns")
gpt <- subset_taxa(GlobalPatterns, Kingdom=="Bacteria")
gpt <- prune_taxa(names(sort(taxa_sums(gpt),TRUE)[1:300]), gpt)
plot_heatmap(gpt, sample.label="SampleType")
## Warning: Transformation introduced infinite values in discrete y-axis
gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
plot_heatmap(gpac)
## Warning: Transformation introduced infinite values in discrete y-axis
(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family"))
## Warning: Transformation introduced infinite values in discrete y-axis
p$scales$scales[[1]]$name <- "My X-Axis"
p$scales$scales[[2]]$name <- "My Y-Axis"
print(p)
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#000033", high="#CCFF66")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#000033", high="#FF3300")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#000033", high="#66CCFF")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#66CCFF", high="#000033", na.value="white")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family", low="#FFFFCC", high="#000033", na.value="white")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "NMDS", "jaccard")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "DCA", "none", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "RDA", "none", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "PCoA", "bray", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "PCoA", "unifrac", "SampleType", "Family")
## Warning: Transformation introduced infinite values in discrete y-axis
plot_heatmap(gpac, "MDS", "unifrac", "SampleType", "Family", weighted=TRUE)
## Warning: Transformation introduced infinite values in discrete y-axis
heatmap(otu_table(gpac))
library(phyloseq); packageVersion("phyloseq")
## [1] '1.30.0'
packageVersion("ggplot2")
## [1] '3.3.2'
data(enterotype)
set.seed(711L)
enterotype = subset_samples(enterotype, !is.na(Enterotype))
plot_net(enterotype, maxdist = 0.4, point_label = "Sample_ID")
plot_net(enterotype, maxdist = 0.3, color = "SeqTech", shape="Enterotype")
ig <- make_network(enterotype, max.dist=0.3)
plot_network(ig, enterotype)
## Warning: attributes are not identical across measure variables; they will be
## dropped
plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.4, label=NULL)
## Warning: attributes are not identical across measure variables; they will be
## dropped
ig <- make_network(enterotype, max.dist=0.2)
plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.4, label=NULL)
## Warning: attributes are not identical across measure variables; they will be
## dropped
ig <- make_network(enterotype, dist.fun="bray", max.dist=0.3)
plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.4, label=NULL)
## Warning: attributes are not identical across measure variables; they will be
## dropped